{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a2e0aa59",
   "metadata": {},
   "source": [
    "# VoxCeleb1 --- Stats"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f18d217",
   "metadata": {},
   "source": [
    "|                    | Dev     | Test   | All      |\n",
    "| ---                | ---     | ---    | ---      |\n",
    "| Number of samples  | 148,642 |  4,874 |  153,516 |\n",
    "| Number of speakers |   1,211 |     40 |    1,251 |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6bcb8d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# One-time setup. os.chdir() below is relative, so re-running this cell in a\n",
    "# live kernel would otherwise climb two more directories (and insert another\n",
    "# sys.path entry) on every execution. The flag lives in the kernel namespace,\n",
    "# so a kernel restart clears it and Restart & Run All behaves as before.\n",
    "if not globals().get('_NOTEBOOK_SETUP_DONE', False):\n",
    "    # Move the working directory two levels up; the relative DATASET_PATH\n",
    "    # used later in this notebook is resolved from there.\n",
    "    os.chdir('../..')\n",
    "    # Extend the import search path (needed for the sslsv import further down,\n",
    "    # presumably because the package lives at that location - confirm).\n",
    "    sys.path.insert(1, os.path.join(sys.path[0], '../..'))\n",
    "    _NOTEBOOK_SETUP_DONE = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1155a636",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import soundfile as sf\n",
    "\n",
    "from plotnine import *\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09f36c9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of the dataset, interpolated into the plot titles.\n",
    "DATASET_NAME = 'VoxCeleb1'\n",
    "\n",
    "# Root folder of the dataset. The path is relative, so it is resolved against\n",
    "# the working directory selected by the os.chdir() call in the setup cell.\n",
    "DATASET_PATH = 'data/voxceleb1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a969e860",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Audio files are looked up exactly two directory levels below the dataset\n",
    "# root (DATASET_PATH/speaker/subfolder/file.wav, as implied by the patterns).\n",
    "# Without recursive=True, glob treats '**' exactly like '*', so plain '*' is\n",
    "# used here to make the fixed depth explicit.\n",
    "files = glob(f'{DATASET_PATH}/*/*/*.wav')\n",
    "\n",
    "# Count only directories as speakers, so that stray files stored next to the\n",
    "# speaker folders (metadata, lists, ...) do not inflate the total.\n",
    "speakers = [path for path in glob(f'{DATASET_PATH}/*') if os.path.isdir(path)]\n",
    "\n",
    "print(f'Number of samples: {len(files)}')\n",
    "print(f'Number of speakers: {len(speakers)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5023fdf0",
   "metadata": {},
   "source": [
    "## Length distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a83712a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Samples longer than this many seconds are left out of the histogram only.\n",
    "MAX_PLOT_LENGTH = 20\n",
    "\n",
    "lengths = []\n",
    "\n",
    "for file in tqdm(files):\n",
    "    # Only the file header is needed to get the duration, so avoid decoding\n",
    "    # the whole waveform with sf.read().\n",
    "    info = sf.info(file)\n",
    "    # Use the sample rate stored in each file instead of assuming 16 kHz.\n",
    "    lengths.append(info.frames / info.samplerate)\n",
    "\n",
    "# Duration in seconds of every sample. This frame is intentionally left\n",
    "# unfiltered so that the percentage cells below are computed over the whole\n",
    "# dataset rather than over the truncated subset used for the plot.\n",
    "df_length = pd.DataFrame({'Length': lengths})\n",
    "\n",
    "# Plot-only view: trim the long tail to keep the histogram readable.\n",
    "df_length_plot = df_length[df_length['Length'] <= MAX_PLOT_LENGTH]\n",
    "\n",
    "plot = (\n",
    "    ggplot()\n",
    "    + xlab('Length (s)')\n",
    "    + ylab('Count')\n",
    "    + ggtitle(f'Lengths distribution of {DATASET_NAME} samples')\n",
    "    + theme_bw()\n",
    "    + theme(figure_size=(10, 6), text=element_text(size=10))\n",
    "    + geom_histogram(\n",
    "        df_length_plot,\n",
    "        aes(x='Length'),\n",
    "        binwidth=1,\n",
    "        color='black',\n",
    "        position='identity',\n",
    "        size=0.25\n",
    "    )\n",
    "    # One tick per second, from 4 s up to the plot cut-off.\n",
    "    + scale_x_continuous(breaks=list(range(4, MAX_PLOT_LENGTH + 1, 1)))\n",
    ")\n",
    "\n",
    "plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "233afdb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of the rows in df_length whose duration is at most 4 seconds.\n",
    "100 * int((df_length['Length'] <= 4).sum()) / len(df_length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33b63461",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of the rows in df_length whose duration exceeds 10 seconds.\n",
    "100 * int((df_length['Length'] > 10).sum()) / len(df_length)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22095eff",
   "metadata": {},
   "source": [
    "## Voice Activity Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4e78bf8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sslsv.preprocessing.VAD import VAD\n",
    "\n",
    "# NOTE(review): the unit and meaning of threshold=-20 are defined by the\n",
    "# project's VAD class (presumably an energy threshold in dB - confirm).\n",
    "vad = VAD(threshold=-20)\n",
    "\n",
    "vads = []\n",
    "for file in tqdm(files):\n",
    "    # The sample rate returned by sf.read() is not needed here.\n",
    "    audio, _sr = sf.read(file, dtype='int16')\n",
    "    # The returned audio is discarded; apply() is called so that\n",
    "    # vad.last_vad can be read afterwards.\n",
    "    _ = vad.apply(audio)\n",
    "    # Mean of the decisions kept in last_vad (presumably one 0/1 value per\n",
    "    # frame, which makes the mean the voiced fraction - confirm).\n",
    "    vads.append(vad.last_vad.mean())\n",
    "\n",
    "df_vad = pd.DataFrame({'VAD': vads})\n",
    "# Scale the stored means by 100 to express them as percentages.\n",
    "df_vad['VAD'] = df_vad['VAD'] * 100\n",
    "\n",
    "plot = (\n",
    "    ggplot()\n",
    "    + xlab('% of voiced frames')\n",
    "    + ylab('Count')\n",
    "    + ggtitle(f'Distribution of voiced frames from {DATASET_NAME} samples')\n",
    "    + theme_bw()\n",
    "    + theme(figure_size=(10, 6), text=element_text(size=10))\n",
    "    + geom_histogram(\n",
    "        df_vad,\n",
    "        aes(x='VAD'),\n",
    "        binwidth=3,\n",
    "        color='black',\n",
    "        position='identity',\n",
    "        size=0.25\n",
    "    )\n",
    ")\n",
    "\n",
    "plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6964b914",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of samples whose voiced-frame percentage is at least 75.\n",
    "100 * int((df_vad['VAD'] >= 75).sum()) / len(df_vad)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8010ec71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of samples whose voiced-frame percentage is at least 50.\n",
    "100 * int((df_vad['VAD'] >= 50).sum()) / len(df_vad)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "307f0fe1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of samples whose voiced-frame percentage is at most 25.\n",
    "100 * int((df_vad['VAD'] <= 25).sum()) / len(df_vad)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
